# Description ------------------------------------------------------------------

### This script loads and cleans the market_only_data csv file
### and turns it into an RData object

### NOTE: Hand-coding was done on market_only_data.csv before being processed
### with this script.

# Settings ---------------------------------------------------------------------
raw_path <- 'data/1_raw/'
clean_path <- 'data/2_clean/'

# Convenience Functions --------------------------------------------------------
source("scripts/0_functions/functions_cleaning.R")

# Packages ---------------------------------------------------------------------
# if packages are not installed, they must be installed with:
# install.packages("package_name"); The name of package must be in quotes
library(dplyr)
library(stringr)
library(purrr)
library(readxl)
library(haven)
library(labelled)
library(hms)

# Data -------------------------------------------------------------------------
#vendor survey endline
load(paste0(clean_path, "vendor_end.RData"))
#vendor survey baseline
load(paste0(clean_path, "vendor_base.RData"))
#tax collector survey endline
load(paste0(clean_path, "tax_collector_end.RData"))
#tax collector survey baseline
load(paste0(clean_path, "tax_collector_base.RData"))
# cleaned market month data
load(paste0(clean_path, "month_data_merged.RData"))
#raw market_only_data
market_only_data <- readxl::read_xlsx(paste0(raw_path, "market_only_data.xlsx"))


################################################################################
# Cleaning ---------------------------------------------------------------------
################################################################################
#SMS retention rate = number of people in a market remained signed up to receive
#SMS in last SMS month, divided by how many people signed up in kickoff meeting
market_only_data <- market_only_data %>% 
  mutate(sms_retention_rate = SMS_Total_End/SMS_Total)

#rename Treatment treatment status variable to make it more compatible
#with other datasets from this project
## Note: Treatment is retained for possible compatibility issues
market_only_data <- market_only_data %>% 
  mutate(treatment_status = ifelse(grepl("Top", Treatment), "TD",
                          ifelse(grepl("Bottom", Treatment), "BU",
                                 ifelse(grepl("Control", Treatment), "Control", 
                                        ifelse(grepl("Both", Treatment), 
                                               "BOTH", Treatment)))),
         Treatment = NULL) 

################################################################################
# Vendor and Tax Collector Survey Compliance Measures --------------------------
################################################################################
#vendor
comp_vendor_endline <- vendor_end %>% group_by(market) %>% 
  mutate(rev_info_system_yes = ifelse(is.na(rev_info_system),
                                      NA, ifelse(rev_info_system != "No",
                                                 1, 0))) %>% 
  summarise_at(vars(meeting_held, 
                    mkt_com_training,
                    any_constr,
                    rev_info_system_yes,
                    info_source_sms,
                    received_sms), mean, na.rm = T) %>% 
  rename(meeting_held_ven = meeting_held,
         mkt_com_training_ven = mkt_com_training,
         any_constr_ven = any_constr,
         rev_info_system_ven = rev_info_system_yes,
         info_source_sms_ven = info_source_sms)
#tax collector
comp_tc_endline <- tax_collector_end %>% group_by(market) %>%
  mutate(rev_info_system_yes = ifelse(is.na(rev_info_system),
                                      NA, ifelse(rev_info_system != "No",
                                                 1, 0))) %>% 
  summarise(meeting_held_tc = mean(meeting_held, na.rm = T),
            any_constr_tc = mean(any_constr, na.rm = T),
            rev_info_system_tc = mean(rev_info_system_yes, na.rm = T),
            info_source_sms_tc = mean(info_source_sms, na.rm = T),
            vendor_counting = mean(vendor_counting, na.rm = T),
            mobile_money = mean(mobile_money, na.rm = T),
            paid_all_commision = mean(paid_all_commision, na.rm = T),
            paid_incentives = mean(paid_incentives, na.rm = T))

#fewer markets in tc?
comp_vendor_endline$market[comp_vendor_endline$market %nin%
                             comp_tc_endline$market]
#"Mankhaka" "Mpala" missing from tc data


#merging two together
comp_endline <- full_join(comp_vendor_endline, comp_tc_endline)

#merging with market_only
market_only_data <- full_join(market_only_data,
                              comp_endline,
                              by = c("Market" = "market"))

#getting number of respondents per mkt, baseline and endline
n_pr_mkt_vb_ttl <- vendor_base %>% group_by(market) %>% 
  summarise(n_pr_mkt_vb_ttl = n())
n_pr_mkt_ve_ttl <- vendor_end %>% group_by(market) %>% 
  summarise(n_pr_mkt_ve_ttl = n())
n_pr_mkt_vb_sh <- vendor_base %>% filter(surveytype2 == "long") %>% 
  group_by(market) %>% 
  summarise(n_pr_mkt_vb_sh = n())
n_pr_mkt_ve_sh <- vendor_end %>% filter(long_survey == 1) %>% 
  group_by(market) %>% 
  summarise(n_pr_mkt_ve_sh = n())
n_pr_mkt_tcb <- tax_collector_base %>% group_by(market) %>% 
  summarise(n_pr_mkt_tcb = n())
n_pr_mkt_tce <- tax_collector_end %>% group_by(market) %>% 
  summarise(n_pr_mkt_tce = n())

n_pr_mkt <- full_join(n_pr_mkt_vb_ttl, 
                  full_join(n_pr_mkt_ve_ttl,
                        full_join(n_pr_mkt_vb_sh,
                              full_join(n_pr_mkt_ve_sh,
                                        full_join(n_pr_mkt_tcb,
                                                  n_pr_mkt_tce)))))

n_pr_mkt[is.na(n_pr_mkt)] <- 0

#merge with market only
market_only_data <- full_join(market_only_data, n_pr_mkt, 
                              by = c("Market" = "market"))

################################################################################
# Creating New Compliance Variables Based on Market Only Data-------------------
################################################################################

### BU

#Did not receive 1st choice project
market_only_data$not_1st_inf <- ifelse(market_only_data$treatment_status %in%
                                         c("BU" , "BOTH"),
                                       ifelse(market_only_data$mkt_infr_proj_priority == 1,
                                         0, 1), NA)
market_only_data$inf_not_in_prty_list <- ifelse(market_only_data$treatment_status %in%
                                         c("BU" , "BOTH"),
                                       ifelse(market_only_data$mkt_infr_proj_priority == 0,
                                              1, 0), NA)

#Market visited before infrastructure project mobilization/construction started
market_only_data <- market_only_data %>% 
  mutate(vst_bfr_inf_mob = ifelse(treatment_status %in% c("BU" , "BOTH"),
                                  ifelse(end_survey_date <= inf_mob_start,
                                         1, 0),
                                  NA),
         vst_bfr_inf_constr = ifelse(treatment_status %in% c("BU" , "BOTH"),
                                    ifelse(end_survey_date <= inf_constr_start,
                                           1, 0),
                                    NA))
#Chinsapo 2 was supposed to get electricity. It never got electricity, nor
#any infrastructure project, although this wasn't clear at endline
#but Gantt chart does not have dates, so need to add it
market_only_data[market_only_data$Market == "Chinsapo 2",
                 c("vst_bfr_inf_mob", 
                   "vst_bfr_inf_constr")] <- 1

#Project was changed during construction (dry borehole or other reason like in
#Dziwe - market shed)
market_only_data <- market_only_data %>% 
  mutate(clean_mtrls = ifelse(treatment_status %in% c("BU" , "BOTH"),
                              ifelse(mkt_infr_proj_rcvd == "CLEANING MATERIALS",
                                     1, 0),
                              NA))


### Misc 

#appointed_elections - captures which markets had appointed and active market
#committees that were then replaced by elected committees under the direction of
#LGAP
## Note: only Mulanje markets had "valid" market committees
market_only_data$appointed_elections <- ifelse(market_only_data$treatment_status %in%
                                                 c("BU", "BOTH"),
                                               ifelse(market_only_data$District == "Mulanje",
                                               0, 1), NA)

################################################################################
# Creating New Compliance Vars Based on Market Month Data
################################################################################

### BU

#grievance messages not responded to
bu_market <- market_month %>% 
  filter(treatment_status %in% c("BU", "BOTH")) %>% 
  group_by(official_name) %>%
  mutate(grievance_mgs_received = ifelse(grievance_mgs_received < 0,
                                         NA, 
                                         grievance_mgs_received)) %>% 
  summarise(grievance_mgs_total = sum(grievance_mgs_received, na.rm = T),
            grievance_resp_none_per = sum(grievance_resp_none, na.rm = T)/
              sum(!is.na(grievance_mgs_received)),
            grievance_resp_none = ifelse(grievance_resp_none_per > 0, 1, 0))

### TD 

#vendor counts delayed
td_market <- market_month %>% filter(treatment_status %in% c("TD", "BOTH"),
                        year == 2018, month %in% c("March") )  %>% 
  select(official_name, vendor_count_issue) %>% 
  mutate(vendor_count_delayed = ifelse(vendor_count_issue == 2,
                                       1,
                                       0)) %>% 
  select(-vendor_count_issue)

#vendor counts started early (active Jan or Feb 2018)
td_market <- market_month %>% filter(treatment_status %in% c("TD", "BOTH"),
                                     year == 2018, month %in% c("January",
                                                                "February"))  %>% 
  select(official_name, vendor_count_issue) %>% 
  group_by(official_name) %>% 
  summarize(vendor_count_early = sum(vendor_count_issue)) %>% 
  mutate(vendor_count_early = ifelse(vendor_count_early < 4, 
                                      1,
                                      0)) %>% 
  full_join(td_market)

#vendor count not active during intervention period
td_market <- market_month %>% filter(treatment_status %in% c("TD", "BOTH"),
                                     year == 2018, month %in% c("March",
                                                                "April",
                                                                "May",
                                                                "June",
                                                                "July",
                                                                "August",
                                                                "September",
                                                                "October"))  %>% 
  select(official_name, vendor_count_issue) %>% 
  group_by(official_name) %>% 
  summarize(vendor_count_issue = sum(vendor_count_issue)) %>% 
  mutate(vendor_count_issue = ifelse(vendor_count_issue > 0, 
                                     1,
                                     0)) %>% 
  full_join(td_market)

#lack of evidence that revenue target was communicated
td_market <- market_month %>%
  filter(treatment_status %in% c("TD", "BOTH"))  %>% 
  select(official_name, rev_targ_issue) %>% 
  mutate(rev_targ_issue = ifelse(rev_targ_issue < 0,
                                         NA, 
                                         rev_targ_issue)) %>% 
  group_by(official_name) %>% 
  summarize(rev_targ_issue = sum(rev_targ_issue, na.rm = T)) %>% 
  mutate(rev_targ_issue = ifelse(rev_targ_issue >= 2, 1, 0)) %>% 
  full_join(td_market)

#incentive given incorrectly or not given incorrectly, or each individually,
#or incentive delayed
td_market <- market_month %>%
  filter(treatment_status %in% c("TD", "BOTH"))  %>% 
  select(official_name, incentive_giv_inc, incentive_not_giv_inc,
         incentive_delayed) %>% 
  mutate(incentive_inc_all = 1*(incentive_giv_inc == 1 | incentive_not_giv_inc == 1)) %>% 
  group_by(official_name) %>% 
  summarize(incentive_inc_all = sum(incentive_inc_all, na.rm = T),
            incentive_giv_inc = sum(incentive_giv_inc, na.rm = T),
            incentive_not_giv_inc = sum(incentive_not_giv_inc, na.rm = T),
            incentive_delayed = sum(incentive_delayed, na.rm = T)) %>% 
  mutate(incentive_inc_all = ifelse(incentive_inc_all >= 1, 1, 0),
         incentive_giv_inc = ifelse(incentive_inc_all >= 1, 1, 0),
         incentive_not_giv_inc = ifelse(incentive_not_giv_inc >= 1, 1, 0),
         incentive_delayed = ifelse(incentive_delayed >= 1, 1, 0)) %>% 
  full_join(td_market)

#mobile money
td_market <- market_month %>%
  filter(treatment_status %in% c("TD", "BOTH"))  %>% 
  select(official_name, mob_mon_transf, mob_mon_trnsctns,
         market_fees_collected_cl) %>%
  mutate(mob_mon_no_trnsctns = ifelse(mob_mon_trnsctns < 1, 1, 0),
         mob_mon_coll_less = ifelse(mob_mon_transf < .5 * market_fees_collected_cl, 
                                    1, 0)) %>% 
  group_by(official_name) %>% 
  summarize(mob_mon_no_trnsctns = sum(mob_mon_no_trnsctns, na.rm = T),
            mob_mon_coll_less = sum(mob_mon_coll_less, na.rm = T)) %>% 
  mutate(mob_mon_no_trnsctns = ifelse(mob_mon_no_trnsctns >= 1, 1, 0),
         mob_mon_coll_less = ifelse(mob_mon_coll_less >= 1, 1, 0)) %>% 
  full_join(td_market)

td_market <- market_month %>%
  filter(treatment_status %in% c("TD", "BOTH"),
         year == 2018, month %in% c("June",
                                    "July",
                                    "August",
                                    "September",
                                    "October",
                                    "November",
                                    "December"))  %>%
  select(official_name, mob_mon_transf, mob_mon_trnsctns,
         market_fees_collected_cl) %>%
  mutate(mob_mon_no_trnsctns_after_May = ifelse(mob_mon_trnsctns < 1, 1, 0),
         mob_mon_coll_less_after_May = ifelse(mob_mon_transf < .5 * market_fees_collected_cl, 
                                    1, 0)) %>% 
  group_by(official_name) %>% 
  summarize(mob_mon_no_trnsctns_after_May = sum(mob_mon_no_trnsctns_after_May, na.rm = T),
            mob_mon_coll_less_after_May = sum(mob_mon_coll_less_after_May, na.rm = T)) %>% 
  mutate(mob_mon_no_trnsctns_after_May = ifelse(mob_mon_no_trnsctns_after_May >= 1, 1, 0),
         mob_mon_coll_less_after_May = ifelse(mob_mon_coll_less_after_May >= 1, 1, 0)) %>% 
  full_join(td_market)
  
#merge with market_only_data

market_only_data <- full_join(market_only_data, td_market, 
                              by = c("Market" = "official_name")) %>% 
  full_join(bu_market, 
            by = c("Market" = "official_name"))

### Creating final compliance variables --> signify if a market has compliance 
### issue; how many it is; how many TD compliance issues; how many BU compliance
#issues
market_only_data <- market_only_data %>% 
  mutate(compl_issues_full = NAto0(not_1st_inf) + #BU
           NAto0(vst_bfr_inf_mob) + #BU
           NAto0(grievance_resp_none) + #BU
           NAto0(clean_mtrls) + #BU
           NAto0(vendor_count_delayed) + #TD
           NAto0(rev_targ_issue) + #TD
           NAto0(incentive_not_giv_inc) + #TD
           NAto0(incentive_delayed) + #TD
           NAto0(mob_mon_no_trnsctns_after_May), #TD
         compl_issues_full_BU = not_1st_inf + #BU
           vst_bfr_inf_mob + #BU
           grievance_resp_none + #BU
           clean_mtrls, #BU
         compl_issues_full_TD = vendor_count_delayed + #TD
           rev_targ_issue + #TD
           incentive_not_giv_inc + #TD
           incentive_delayed + #TD
           mob_mon_no_trnsctns_after_May, #TD
         compl_issue_full = ifelse(compl_issues_full > 0, 1, 0),
         compl_issue_full_TD = ifelse(compl_issues_full_TD > 0, 1, 0),
         compl_issue_full_BU = ifelse(compl_issues_full_BU > 0, 1, 0),
         #adding compl_issue_main b/c lack of var in some compl vars
         compl_issues_main = 
           NAto0(vst_bfr_inf_mob) + #BU
           NAto0(grievance_resp_none) + #BU
           NAto0(clean_mtrls) + #BU
           NAto0(incentive_not_giv_inc) + #TD
           NAto0(incentive_delayed) + #TD
           NAto0(mob_mon_no_trnsctns_after_May), #TD
         compl_issues_main_BU = vst_bfr_inf_mob + #BU
           grievance_resp_none + #BU
           clean_mtrls, #BU
         compl_issues_main_TD = incentive_not_giv_inc + #TD
           incentive_delayed + #TD
           mob_mon_no_trnsctns_after_May, #TD
         compl_issue_main = ifelse(compl_issues_main > 0, 1, 0),
         compl_issue_main_TD = ifelse(compl_issues_main_TD > 0, 1, 0),
         compl_issue_main_BU = ifelse(compl_issues_main_BU > 0, 1, 0),
         #now creating variables to denote when a market got full treatment
         #strict version = one compliance issue means market didn't "get" treatment
         #relaxed version = three compliance issues means market didn't "get" treatment
         BU_treat_rcvd_strict = NAto0(abs(1 - compl_issue_main_BU)),
         TD_treat_rcvd_strict = NAto0(abs(1 - compl_issue_main_TD)),
         BU_treat_rcvd_relaxed = NAto0(abs(1 - (compl_issues_main_BU == 3))),
         TD_treat_rcvd_relaxed = NAto0(abs(1 - (compl_issues_main_TD == 3))),
         BOTH_rcvd_strict = BU_treat_rcvd_strict * TD_treat_rcvd_strict,
         BOTH_rcvd_relaxed = BU_treat_rcvd_relaxed * TD_treat_rcvd_relaxed,
         BU_rcvd_strict = ifelse(BU_treat_rcvd_strict == 1 &
                                   TD_treat_rcvd_strict == 0, 1, 0),
         TD_rcvd_strict = ifelse(BU_treat_rcvd_strict == 0 &
                                   TD_treat_rcvd_strict == 1, 1, 0),
         BU_rcvd_relaxed = ifelse(BU_treat_rcvd_relaxed == 1 &
                                   TD_treat_rcvd_relaxed == 0, 1, 0),
         TD_rcvd_relaxed = ifelse(BU_treat_rcvd_relaxed == 0 &
                                   TD_treat_rcvd_relaxed == 1, 1, 0)
         ) 
 

################################################################################
# Saving -----------------------------------------------------------------------
################################################################################
save(market_only_data, file = paste0(clean_path, "market_only_data.RData"))
